support <- read_tsv(here::here("NYGC_ALS_RNA_seq_support.tsv")) %>% as.data.frame()
library_sizes <- read_tsv(here::here("STMN2-pipeline/results/all_nygc_nov_2019/all_nygc_nov_2019_library_sizes.tsv"),col_names = c("sample", "library_size") ) %>%
mutate( sample = gsub("data/metrics/", "", sample) )
support <-
support %>%
# impute missing values
mutate( rin = replace_na(rin, median(rin, na.rm = TRUE)),
age = replace_na(age, median(age, na.rm = TRUE)),
pmi = replace_na(pmi, median(pmi, na.rm = TRUE))) %>%
left_join(library_sizes, by = "sample")
## fix disease == "Other" samples
# if disease_full contains ALS and FTD then mark as ALS-FTD
# assume sample with ALS and Dementia (NOS) is ALS-FTD
support <-
support %>%
mutate( disease = case_when(
disease == "Other" & grepl("ALS", disease_full) & grepl("FTD|NOS", disease_full) ~ "ALS-FTD",
disease == "Other" & grepl("ALS", disease_full) & !grepl("FTD|NOS", disease_full) ~ "ALS",
TRUE ~ disease
))
support <-
support %>%
mutate( pathology = case_when(
disease == "Control" ~ "Control",
disease == "FTD" & mutations == "C9orf72" ~ "FTD-C9",
disease == "FTD" & mutations != "C9orf72" & ( is.na(pathology) | grepl("TDP", pathology) ) ~ "FTD-TDP",
mutations == c("SOD1", "ANG") ~ "ALS-other",
mutations == "C9orf72" & disease == "ALS" ~ "ALS-C9",
mutations != "C9orf72" & disease == "ALS" ~ "ALS",
disease == "ALS-FTD" ~ "ALS-FTD",
disease == "ALS-AD" ~ "ALS-AD",
disease == "Other" ~ "Other",
TRUE ~ pathology
)) %>%
filter( ! tissue %in% c("Cell_Line", "Liver")) %>% # ignored these
mutate(tissue = gsub("_"," ", tissue)) %>%
#filter(tissue %in% c("Cerebellum", "Frontal Cortex", "Temporal Cortex", "Lumbar Spinal Cord", "Cervical Spinal Cord", "Thoracic Spinal Cord", "Lateral Motor Cortex", "Medial Motor Cortex", "Occipital Cortex") ) %>%
# mutate( tissue = factor(tissue,levels = c("Cerebellum", "Frontal Cortex", "Temporal Cortex", "Lumbar Spinal Cord", "Cervical Spinal Cord", "Thoracic Spinal Cord", "Lateral Motor Cortex", "Medial Motor Cortex", "Occipital Cortex") ) )%>%
filter(!duplicated(sample)) # remove single duplicated sample - CGND-HRA-00431
#table(support$disease, support$pathology)
# categorise into and ALS, FTD, ALS-FTD and +/- TDP-43
support <-
support %>%
mutate( tdp_status = case_when(
disease == "Control" ~ "TDP-43 negative",
pathology %in% c("ALS", "ALS-C9", "ALS-FTD", "ALS-AD", "FTD-C9", "FTD-TDP") ~ "TDP-43 positive",
pathology %in% c("ALS-other","FTD-TAU", "FTD-FUS") ~ "TDP-43 negative",
TRUE ~ "unknown"
) ) %>%
mutate( disease_tdp = case_when(
disease == "ALS" & tdp_status == "TDP-43 positive" ~ "ALS",
disease == "ALS" & tdp_status == "TDP-43 negative" ~ "ALS-other",
disease == "FTD" & tdp_status == "TDP-43 positive" ~ "FTD-TDP",
disease == "FTD" & tdp_status == "TDP-43 negative" ~ "FTD-other",
TRUE ~ pathology
))
#table(support$disease, support$tdp_status)
#table(support$pathology, support$tdp_status)
rownames(support) <- support$sample
table(support$pathology)
##
## ALS ALS-AD ALS-C9 ALS-FTD ALS-other Control FTD-C9
## 565 64 86 92 16 183 45
## FTD-FUS FTD-TAU FTD-TDP Other
## 12 19 81 60
table(support$tissue, support$disease_tdp) %>%
knitr::kable() %>%
kableExtra::kable_styling()
| ALS | ALS-AD | ALS-FTD | ALS-other | Control | FTD-other | FTD-TDP | Other | |
|---|---|---|---|---|---|---|---|---|
| Cerebellum | 140 | 10 | 17 | 6 | 31 | 11 | 50 | 10 |
| Cervical Spinal Cord | 97 | 9 | 13 | 0 | 27 | 0 | 1 | 8 |
| Frontal Cortex | 96 | 10 | 17 | 4 | 37 | 12 | 37 | 9 |
| Hippocampus | 7 | 0 | 3 | 0 | 0 | 0 | 0 | 1 |
| Lateral Motor Cortex | 46 | 6 | 6 | 2 | 9 | 0 | 1 | 5 |
| Lumbar Spinal Cord | 82 | 9 | 12 | 1 | 27 | 0 | 1 | 9 |
| Medial Motor Cortex | 52 | 5 | 5 | 0 | 11 | 0 | 1 | 7 |
| Occipital Cortex | 42 | 7 | 6 | 0 | 6 | 0 | 1 | 4 |
| Other | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Sensory Cortex | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Temporal Cortex | 22 | 3 | 3 | 0 | 26 | 8 | 33 | 2 |
| Thoracic Spinal Cord | 38 | 5 | 6 | 3 | 8 | 0 | 1 | 4 |
| Unspecified Motor Cortex | 26 | 0 | 4 | 0 | 1 | 0 | 0 | 1 |
For all samples used here, get junctions and cluster with Leafcutter. Can I find evidence of TDP-43 linked splicing events? Do they correlate with cellular composition?
STMN2 is both a neuronal marker and TDP-43 splicing target. Clustering junctions with leafcutter allows the quantification of both STMN2 expression and STMN2 mis-splicing. Do both of these variables correlate with cell-type composition?
# all CNS samples currently available
all_junctions <-
read.table(file = here::here("STMN2-pipeline/results/all_nygc_nov_2019/all_nygc_nov_2019_perind_numers.counts.gz"), header = TRUE, stringsAsFactors = FALSE)
meta <- leafcutter::get_intron_meta(row.names(all_junctions))
# find cluster containing novel junction
stmn2_clu <- filter(meta, chr == 'chr8', start >= 79611214, end <= 79636802) %>%
pull(clu) %>%
unique()
stmn2 <- all_junctions[ meta$clu == stmn2_clu, ]
stmn2 <- rownames_to_column(stmn2, var = "junction")
# reshape the table
stmn2 <-
tidyr::gather(stmn2, key = "sample", value = "counts", -junction) %>%
# tidyr::spread(key = junction, value = counts) %>%
mutate(sample = gsub(".Aligned.Quality.Sorted.bam", "", sample, fixed = TRUE) ) %>%
mutate(sample = gsub(".", "-", sample, fixed = TRUE)) %>%
inner_join(support, by = "sample")
# annotate junctions
stmn2_meta <- leafcutter::get_intron_meta(stmn2$junction) %>%
mutate( anno = case_when( start ==79611214 & end == 79616822 ~ "novel_5SS",
start == 79611214 & end == 79636802 ~ "major",
start == 79611791 ~ "novel 3SS A",
start == 79611433 ~ "novel 3SS B"))
stmn2$junction_type <- stmn2_meta$anno
# library size is in total reads
# divide by 2 to get read pairs
# divide counts by total read pairs to normalise
# times by 1e6 get per million
stmn2$counts_tpm <- (stmn2$counts / (stmn2$library_size /2 ) ) * 1e6
# only plot tissues with 20+ samples
tissues_to_plot <-
group_by(support, tissue ) %>% tally() %>%
filter(n >= 20) %>%
pull(tissue)
# for now exclude "other" and "other FTD" categories
stmn2_als_ftd <-
stmn2 %>%
filter(pathology != "Other" ) %>%
mutate( pathology = forcats::fct_relevel(pathology, "Control")) %>%
mutate( disease = forcats::fct_relevel(disease, "Control")) %>%
filter(tissue %in% tissues_to_plot) %>%
mutate( tissue = gsub("_", " ", tissue )) %>%
mutate(tissue = factor(tissue, levels = c("Frontal Cortex", "Temporal Cortex", "Cerebellum", "Cervical Spinal Cord", "Thoracic Spinal Cord", "Lumbar Spinal Cord", "Lateral Motor Cortex", "Medial Motor Cortex", "Occipital Cortex") ))
Make plots
Plot normalised junction counts of major junction and novel junction
# diagnostic plot
stmn2_als_ftd %>%
filter( junction_type %in% c("major") ) %>%
filter( disease %in% c("Control", "ALS", "FTD")) %>%
filter(tissue %in% c("Frontal Cortex", "Cerebellum")) %>%
#filter( platform %in% c("HiSeq 2500", "NovaSeq")) %>%
ggplot(aes( x = disease, y = (counts_tpm), group = paste(disease, prep) )) +
geom_point(width = 0.05, aes(colour = prep), size = 0.5, alpha = 0.9, position = position_jitterdodge(jitter.width = 0.2, jitter.height = 0) ) +
facet_wrap(~tissue, ncol = 2) +
labs(y = "junction counts per million", x = "", title = "STMN2 canonical junction", subtitle = "Effect of library prep method on counts") +
scale_alpha_manual(values = c(1,0.1) ) +
#scale_colour_manual(values = c("black", "gray")) +
#guides(colour = FALSE, alpha = FALSE) +
theme_jh() +
geom_boxplot(fill = NA, colour = "black", outlier.colour = NA) +
theme(panel.border = element_rect(fill =NA))
## Warning: Ignoring unknown parameters: width
There is an effect of library prep interacting with disease status on the STMN2 major junction.
stmn2_als_ftd %>%
filter( junction_type %in% c("major") ) %>%
filter( disease %in% c("Control", "ALS", "FTD", "ALS-FTD")) %>%
filter(tissue %in% c("Frontal Cortex", "Cerebellum")) %>%
filter( !is.na(prep) ) %>%
ggplot(aes( x = rin, y = (counts_tpm) )) +
geom_point(aes(colour = disease), size = 1) +
#geom_jitter(width = 0.05, aes(colour = platform), size = 0.5, alpha = 0.9) +
facet_wrap(~ prep + tissue, ncol = 3) +
labs(y = "junction counts per million", x = "", title = "STMN2 canonical junction correlates with RIN in a prep-specific effect") +
scale_alpha_manual(values = c(1,0.1)) +
#scale_colour_manual(values = c("black", "gray")) +
#guides(colour = FALSE, alpha = FALSE) +
theme_jh() +
#geom_boxplot(fill = NA, colour = "black", outlier.colour = NA) +
theme(panel.border = element_rect(fill =NA)) +
ggpubr::stat_cor()
# levels in all samples - HiSeq
stmn2_als_ftd %>%
filter( junction_type %in% c("major") ) %>%
filter(platform == "HiSeq 2500") %>%
filter(tissue != "Temporal Cortex") %>%
ggplot(aes( x = forcats::fct_rev(pathology), y = (counts_tpm) )) +
geom_jitter(width = 0.05, aes(colour = platform), size = 0.5, alpha = 0.9) +
facet_wrap(~ tissue, ncol = 3) +
coord_flip() +
labs(y = "junction counts per million", x = "", title = "STMN2 canonical junction", subtitle = "HiSeq samples") +
scale_alpha_manual(values = c(1,0.1)) +
#scale_colour_manual(values = c("black", "gray")) +
guides(colour = FALSE, alpha = FALSE) +
theme_jh() +
geom_boxplot(fill = NA, colour = "black", outlier.colour = NA) +
theme(panel.border = element_rect(fill =NA))
stmn2_als_ftd %>%
filter( junction_type %in% c("major") ) %>%
filter(platform == "NovaSeq") %>%
filter( tissue %in% c("Frontal Cortex", "Temporal Cortex", "Cerebellum", "Cervical Spinal Cord", "Lumbar Spinal Cord") ) %>%
#filter(tissue != "Temporal Cortex") %>%
ggplot(aes( x = forcats::fct_rev(pathology), y = (counts_tpm) )) +
geom_jitter(width = 0.05, colour = "black", size = 0.5, alpha = 0.9) +
facet_wrap(~ tissue, ncol = 3) +
coord_flip() +
labs(y = "junction counts per million", x = "", title = "STMN2 canonical junction", subtitle = "NovaSeq samples") +
scale_alpha_manual(values = c(1,0.1)) +
#scale_colour_manual(values = c("black", "gray")) +
guides(colour = FALSE, alpha = FALSE) +
theme_jh() +
geom_boxplot(fill = NA, colour = "black", outlier.colour = NA) +
theme(panel.border = element_rect(fill =NA))
There is clearly an effect of library prep and/or sequencing platform on the quantification of STMN2 splicing. This is confounding by the fact that most ALS samples were sequenced in one batch while most of the FTD samples were sequenced in the other batch. However, some trends emerge. ALS samples have consistently higher levels of STMN2 major isoform than controls in the frontal cortex, whereas FTD samples have lower levels of STMN2 in the frontal and temporal cortices.
# # expression of major isoform
# stmn2_major_plot <- stmn2 %>%
# left_join(stmn_rsem_voom, by = "sample") %>%
# filter( junction_type %in% c("major isoform") ) %>%
# ggplot(aes( x = forcats::fct_rev(pathology), y = stmn2_expression )) +
# geom_jitter(width = 0.1, colour = "black", size = 0.5, alpha = 0.9) +
# facet_wrap(~ tissue, ncol = 3, scale = "free_y") +
# coord_flip() +
# labs(y = "normalised gene counts", x = "", title = "STMN2 canonical isoform") +
# scale_alpha_manual(values = c(1,0.1)) +
# #scale_colour_manual(values = c("black", "gray")) +
# guides(colour = FALSE, alpha = FALSE) +
# theme_jh() +
# geom_boxplot(fill = NA, colour = "black", outlier.colour = NA) +
# theme_jh() +
# theme(panel.border = element_rect(fill = NA), axis.line = element_blank())
# # expression of major isoform
# novaseq_only_stmn2_major_plot <- stmn2 %>%
# left_join(stmn_rsem_voom, by = "sample") %>%
# filter( junction_type %in% c("major isoform") ) %>%
# filter(platform == "NovaSeq") %>%
# ggplot(aes( x = forcats::fct_rev(pathology), y = stmn2_expression )) +
# geom_jitter(aes(colour = platform), width = 0.1, size = 0.5, alpha = 0.9) +
# facet_wrap(~ tissue, ncol = 3, scale = "free_y") +
# coord_flip() +
# labs(y = "normalised gene counts", x = "", title = "STMN2 canonical isoform - NovaSeq samples only") +
# scale_alpha_manual(values = c(1,0.1)) +
# #scale_colour_manual(values = c("black", "gray")) +
# guides(colour = FALSE, alpha = FALSE) +
# theme_jh() +
# geom_boxplot(fill = NA, colour = "black", outlier.colour = NA) +
# theme_jh() +
# theme(panel.border = element_rect(fill = NA), axis.line = element_blank())
#
#
# stmn2_major_plot
#
# ggsave(stmn2_major_plot, filename = "plots/STMN2/stmn2_all_samples_major_isoform.pdf", width = 9, height = 7, device = cairo_pdf)
ALS samples have higher STMN2 expression - is this due to batch effect of controls being mostly done on NovaSeq?
Plot counts of STMN2 novel junction.
# plot counts
stmn2_als_ftd %>%
filter( junction_type %in% c("novel_5SS")) %>%
ggplot(aes( x = forcats::fct_rev(pathology), y = (counts_tpm ) )) +
geom_jitter(width = 0.5, aes(colour = pathology, alpha = counts_tpm == 0), size =1) +
facet_wrap(~ tissue, ncol = 5, scales = "free") +
coord_flip() +
labs(y = "counts per million", x = "", title = "STMN2 novel isoform") +
scale_alpha_manual(values = c(1,0.1)) +
#scale_colour_manual(values = c("black", "gray")) +
guides(colour = FALSE, alpha = FALSE) +
theme_jh() +
theme(panel.border = element_rect(fill = NA), axis.line = element_blank())
# plot counts
stmn2 %>%
filter( junction_type %in% c("novel_5SS")) %>%
ggplot(aes( x = forcats::fct_rev(pathology), y = (counts ) )) +
geom_jitter(width = 0.2, height = 0, aes(colour = pathology, alpha = counts_tpm == 0), size =1) +
facet_wrap(~ tissue, ncol = 5, scales = "free") +
coord_flip() +
labs(y = "raw junction counts", x = "", title = "STMN2 novel isoform") +
scale_alpha_manual(values = c(1,0.1)) +
#scale_colour_manual(values = c("black", "gray")) +
guides(colour = FALSE, alpha = FALSE) +
theme_jh() +
theme(panel.border = element_rect(fill = NA), axis.line = element_blank())
Treat as binary outcome
Try to plot multiple thresholds simultaneously eg >= 1, >=2, >=5 in the same chart maybe with patchwork?
# # treat as binary - counts > 0
# stmn2_proportions_1 <-
# stmn2 %>%
# filter( junction_type %in% c("novel_5SS")) %>%
# filter( tissue %in% tissues_to_plot) %>%
# mutate( pathology = forcats::fct_relevel(pathology, "Control")) %>%
# mutate(stmn2_status = counts > 0 ) %>%
# group_by( pathology, tissue ) %>%
# summarise( total = n(), stmn2_positive = sum(stmn2_status) ) %>%
# mutate( prop = stmn2_positive / total ) %>%
# mutate(stmn2_positive = as.character(stmn2_positive)) %>%
# mutate(stmn2_positive = ifelse( stmn2_positive == "0", "", stmn2_positive))
#
#
# stmn2_proportions_2 <-
# stmn2 %>%
# filter( junction_type %in% c("novel_5SS")) %>%
# filter( tissue %in% tissues_to_plot) %>%
# mutate( pathology = forcats::fct_relevel(pathology, "Control")) %>%
# mutate(stmn2_status = counts > 1 ) %>%
# group_by( pathology, tissue ) %>%
# summarise( total = n(), stmn2_positive = sum(stmn2_status) ) %>%
# mutate( prop = stmn2_positive / total ) %>%
# mutate(stmn2_positive = as.character(stmn2_positive)) %>%
# mutate(stmn2_positive = ifelse( stmn2_positive == "0", "", stmn2_positive))
#
#
# stmn2_prop_plot_1 <-
# stmn2_proportions_1 %>%
# ggplot(aes( x = forcats::fct_rev(pathology), y = prop )) +
# geom_col() +
# geom_text(aes(label = total), y = 1.05, size = 3) +
# geom_text(data = filter(stmn2_proportions_1, prop < 0.9), aes(label = stmn2_positive ), nudge_y = 0.05, size = 3) +
# geom_text(data = filter(stmn2_proportions_1, prop >= 0.9), aes(label = stmn2_positive ), nudge_y = -0.05, size = 3, colour = "white") +
# scale_y_continuous(expand = c(0,0), limits = c(0,1.1),labels = scales::percent ) +
# #geom_errorbar(aes(ymin = prop, ymax = binom_conf_upper)) +
# facet_wrap(~tissue, scales = "free_y") +
# geom_hline(yintercept = 1, linetype = 3) +
# coord_flip() +
# labs(y = "% of samples with 1 or more novel STMN2 junction read", x = "", title = "STMN2 novel splicing across disease and tissue") +
# theme_jh() +
# theme( panel.border = element_rect(fill = NA), axis.line = element_blank() )
#
# tdp_status_support <- support %>%
# select(disease, pathology, tdp_status) %>%
# distinct()
#
#
# stmn2_prop_plot_2 <-
# stmn2_proportions_2 %>%
# #left_join(tdp_status_support, by = "pathology") %>%
# ggplot(aes( x = forcats::fct_rev(pathology), y = prop )) +
# geom_col() +
# geom_text(aes(label = total), y = 1.05, size = 3) +
# geom_text(data = filter(stmn2_proportions_2, prop < 0.9), aes(label = stmn2_positive ), nudge_y = 0.05, size = 3) +
# geom_text(data = filter(stmn2_proportions_2, prop >= 0.9), aes(label = stmn2_positive ), nudge_y = -0.05, size = 3, colour = "white") +
# scale_y_continuous(expand = c(0,0), limits = c(0,1.1),labels = scales::percent ) +
# #geom_errorbar(aes(ymin = prop, ymax = binom_conf_upper)) +
# facet_wrap(~tissue, scales = "free_y") +
# geom_hline(yintercept = 1, linetype = 3) +
# coord_flip() +
# labs(y = "% of samples with 2 or more novel STMN2 junction reads", x = "", title = "STMN2 novel splicing across disease and tissue") +
# theme_jh() +
# theme( panel.border = element_rect(fill = NA), axis.line = element_blank() )
#
#
# stmn2_prop_plot_1
# stmn2_prop_plot_2
Count samples at different read count thresholds
stmn2_novel <- filter(stmn2, junction_type %in% c("novel_5SS"))
stmn2_read_thresholds <-
stmn2_novel %>%
filter(tissue %in% c("Frontal Cortex", "Cervical Spinal Cord", "Thoracic Spinal Cord", "Temporal Cortex", "Lumbar Spinal Cord", "Occipital Cortex", "Lateral Motor Cortex", "Medial Motor Cortex", "Cerebellum")) %>%
mutate( stmn2_1 = counts == 1, stmn2_2 = counts >=2 & counts < 5, stmn2_5 = counts >= 5) %>%
#group_by(tissue, pathology,TDP = tdp_status) %>%
group_by(tissue, disease_tdp) %>%
summarise( samples = n(), `1` = sum(stmn2_1) / n(), `2-4` = sum(stmn2_2) / n(), `5+` = sum(stmn2_5) / n() ) %>%
#tidyr::gather(key = "threshold", value = count, -tissue, -pathology, -TDP, -samples ) %>%
tidyr::gather(key = "threshold", value = count, -tissue, -disease_tdp, -samples ) %>%
ungroup() %>%
mutate( disease_tdp = forcats::fct_relevel(disease_tdp, "Control")) %>%
mutate( disease_tdp = forcats::fct_rev(disease_tdp)) %>%
mutate(tissue = factor(tissue, levels = c("Frontal Cortex", "Temporal Cortex", "Cerebellum", "Cervical Spinal Cord", "Thoracic Spinal Cord", "Lumbar Spinal Cord", "Lateral Motor Cortex", "Medial Motor Cortex", "Occipital Cortex") ))
stmn2_threshold_plot <-
stmn2_read_thresholds %>%
ggplot(aes(x = disease_tdp, y = count ) ) +
geom_col(aes(alpha = threshold), fill = "blue", colour = "white") +
#facet_wrap(~Tissue) +
#coord_flip() +
geom_text(data = select(stmn2_read_thresholds, tissue, disease_tdp, samples) %>% distinct(), aes(label = samples), y = 1.05, size = 3) +
#geom_text(data = filter(stmn2_proportions_2, prop < 0.9), aes(label = stmn2_positive ), nudge_y = 0.05, size = 3) +
#geom_text(data = filter(stmn2_proportions_2, prop >= 0.9), aes(label = stmn2_positive ), nudge_y = -0.05, size = 3, colour = "white") +
scale_y_continuous(expand = c(0,0), limits = c(0,1.1),labels = scales::percent ) +
#geom_errorbar(aes(ymin = prop, ymax = binom_conf_upper)) +
facet_wrap(~tissue, scales = "free_y") +
geom_hline(yintercept = 1, linetype = 3) +
labs(x = "", y = "Proportion of samples with STMN2 junctions") +
coord_flip() +
#labs(y = "% of samples with 2 or more novel STMN2 junction reads", x = "", title = "STMN2 novel splicing across disease and tissue") +
theme_jh() +
scale_alpha_manual("read\nthreshold",values = c(0.3,0.6,1) ) +
theme( panel.border = element_rect(fill = NA), axis.line = element_blank() )
stmn2_threshold_plot
ggsave(plot = stmn2_threshold_plot, dpi = 600, width = 9, height = 6, filename = "../figures/STMN2_read_threshold_NYGC.png" )
# mutate( tdp_status = ifelse(tdp_status == "TDP-43 negative", yes = "-other", no = "-TDP")) %>%
# group_by(Tissue = tissue, Disease = disease,`TDP-43` = tdp_status) %>%
# summarise( samples = n(), `≥1` = sum(stmn2_1), `≥2` = sum(stmn2_2), `>5` = sum(stmn2_5) ) %>%
# knitr::kable(align = "c") %>%
# kableExtra::kable_styling(full_width = FALSE) %>%
# kableExtra::add_header_above(c(" " = 4, "Samples with STMN2 novel junction reads" = 3)) %>%
# kableExtra::column_spec(4, bold = TRUE) %>%
# kableExtra::collapse_rows(columns = 1:2)
stmn2_novel %>%
mutate( stmn2_1 = counts >= 1, stmn2_2 = counts >= 2, stmn2_5 = counts >= 5) %>%
mutate( tdp_status = ifelse(tdp_status == "TDP-43 negative", yes = "\\-", no = "\\+")) %>%
group_by(disease,tdp_status) %>%
summarise( samples = n(), `≥1` = sum(stmn2_1), `≥2` = sum(stmn2_2), `>5` = sum(stmn2_5) ) %>%
knitr::kable(align = "c") %>%
kableExtra::kable_styling(full_width = FALSE) %>%
kableExtra::add_header_above(c(" " = 3, "Samples with STMN2 novel junction reads" = 3)) %>%
kableExtra::column_spec(3, bold = TRUE)
| disease | tdp_status | samples | ≥1 | ≥2 | >5 |
|---|---|---|---|---|---|
| ALS | - | 15 | 0 | 0 | 0 |
| ALS | + | 650 | 208 | 154 | 80 |
| ALS-AD | + | 64 | 18 | 13 | 6 |
| ALS-FTD | + | 92 | 49 | 40 | 21 |
| Control | - | 183 | 5 | 0 | 0 |
| FTD | - | 31 | 1 | 0 | 0 |
| FTD | + | 126 | 64 | 51 | 37 |
| Other | - | 1 | 0 | 0 | 0 |
| Other | + | 60 | 6 | 1 | 1 |
# by tissue
stmn2_novel %>%
mutate( disease = forcats::fct_relevel(disease, "Control")) %>%
filter(tissue %in% c("Frontal Cortex", "Cervical Spinal Cord", "Thoracic Spinal Cord", "Temporal Cortex", "Lumbar Spinal Cord", "Occipital Cortex", "Lateral Motor Cortex", "Medial Motor Cortex", "Cerebellum", "Unspecified Motor Cortex")) %>%
mutate( stmn2_1 = counts >= 1, stmn2_2 = counts >= 2, stmn2_5 = counts >= 5) %>%
mutate( tdp_status = ifelse(tdp_status == "TDP-43 negative", yes = "\\-", no = "\\+")) %>%
group_by(Tissue = tissue, Disease = disease,`TDP-43` = tdp_status) %>%
summarise( samples = n(), `≥1` = sum(stmn2_1), `≥2` = sum(stmn2_2), `>5` = sum(stmn2_5) ) %>%
knitr::kable(align = "c") %>%
kableExtra::kable_styling(full_width = FALSE) %>%
kableExtra::add_header_above(c(" " = 4, "Samples with STMN2 novel junction reads" = 3)) %>%
kableExtra::column_spec(4, bold = TRUE) %>%
kableExtra::collapse_rows(columns = 1:2)
| Tissue | Disease | TDP-43 | samples | ≥1 | ≥2 | >5 |
|---|---|---|---|---|---|---|
| Cerebellum | Control | - | 31 | 3 | 0 | 0 |
| ALS | - | 5 | 0 | 0 | 0 | |
| + | 140 | 4 | 1 | 0 | ||
| ALS-AD | + | 10 | 0 | 0 | 0 | |
| ALS-FTD | + | 17 | 2 | 0 | 0 | |
| FTD | - | 11 | 0 | 0 | 0 | |
| + | 50 | 1 | 0 | 0 | ||
| Other | - | 1 | 0 | 0 | 0 | |
| + | 10 | 1 | 0 | 0 | ||
| Cervical Spinal Cord | Control | - | 27 | 1 | 0 | 0 |
| ALS | + | 97 | 62 | 45 | 26 | |
| ALS-AD | + | 9 | 6 | 4 | 2 | |
| ALS-FTD | + | 13 | 10 | 9 | 4 | |
| FTD | + | 1 | 0 | 0 | 0 | |
| Other | + | 8 | 0 | 0 | 0 | |
| Frontal Cortex | Control | - | 37 | 1 | 0 | 0 |
| ALS | - | 4 | 0 | 0 | 0 | |
| + | 96 | 10 | 8 | 3 | ||
| ALS-AD | + | 10 | 0 | 0 | 0 | |
| ALS-FTD | + | 17 | 11 | 10 | 5 | |
| FTD | - | 12 | 0 | 0 | 0 | |
| + | 37 | 36 | 30 | 21 | ||
| Other | + | 9 | 1 | 0 | 0 | |
| Lateral Motor Cortex | Control | - | 9 | 0 | 0 | 0 |
| ALS | - | 2 | 0 | 0 | 0 | |
| + | 46 | 17 | 10 | 7 | ||
| ALS-AD | + | 6 | 1 | 1 | 1 | |
| ALS-FTD | + | 6 | 2 | 1 | 1 | |
| FTD | + | 1 | 0 | 0 | 0 | |
| Other | + | 5 | 1 | 0 | 0 | |
| Lumbar Spinal Cord | Control | - | 27 | 0 | 0 | 0 |
| ALS | - | 1 | 0 | 0 | 0 | |
| + | 82 | 56 | 46 | 31 | ||
| ALS-AD | + | 9 | 6 | 6 | 3 | |
| ALS-FTD | + | 12 | 10 | 8 | 6 | |
| FTD | + | 1 | 1 | 0 | 0 | |
| Other | + | 9 | 1 | 0 | 0 | |
| Medial Motor Cortex | Control | - | 11 | 0 | 0 | 0 |
| ALS | + | 52 | 16 | 13 | 3 | |
| ALS-AD | + | 5 | 3 | 1 | 0 | |
| ALS-FTD | + | 5 | 3 | 2 | 1 | |
| FTD | + | 1 | 0 | 0 | 0 | |
| Other | + | 7 | 1 | 1 | 1 | |
| Occipital Cortex | Control | - | 6 | 0 | 0 | 0 |
| ALS | + | 42 | 2 | 0 | 0 | |
| ALS-AD | + | 7 | 0 | 0 | 0 | |
| ALS-FTD | + | 6 | 0 | 0 | 0 | |
| FTD | + | 1 | 0 | 0 | 0 | |
| Other | + | 4 | 0 | 0 | 0 | |
| Temporal Cortex | Control | - | 26 | 0 | 0 | 0 |
| ALS | + | 22 | 7 | 4 | 2 | |
| ALS-AD | + | 3 | 0 | 0 | 0 | |
| ALS-FTD | + | 3 | 3 | 3 | 1 | |
| FTD | - | 8 | 1 | 0 | 0 | |
| + | 33 | 26 | 21 | 16 | ||
| Other | + | 2 | 1 | 0 | 0 | |
| Thoracic Spinal Cord | Control | - | 8 | 0 | 0 | 0 |
| ALS | - | 3 | 0 | 0 | 0 | |
| + | 38 | 19 | 14 | 5 | ||
| ALS-AD | + | 5 | 2 | 1 | 0 | |
| ALS-FTD | + | 6 | 2 | 2 | 1 | |
| FTD | + | 1 | 0 | 0 | 0 | |
| Other | + | 4 | 0 | 0 | 0 | |
| Unspecified Motor Cortex | Control | - | 1 | 0 | 0 | 0 |
| ALS | + | 26 | 14 | 12 | 3 | |
| ALS-FTD | + | 4 | 3 | 3 | 1 | |
| Other | + | 1 | 0 | 0 | 0 |
What are these “Other” samples that have STMN novel junctions?
stmn2_novel <- filter(stmn2, junction_type %in% c("novel_5SS"))
stmn2_novel_other <-
filter(stmn2_novel, disease == "Other")
stmn2_novel_other %>% filter(counts > 0 ) %>% arrange(desc(counts)) %>% select(sample, tissue, counts, disease_full)
## sample tissue counts
## 1 CGND-HRA-00568 Medial Motor Cortex 5
## 2 CGND-HRA-00404 Cerebellum 1
## 3 CGND-HRA-00792 Frontal Cortex 1
## 4 CGND-HRA-00793 Temporal Cortex 1
## 5 CGND-HRA-00796 Lumbar Spinal Cord 1
## 6 CGND-HRA-00569 Lateral Motor Cortex 1
## disease_full
## 1 Primary Lateral Sclerosis (PLS)
## 2 Multiple System Atrophy (MSA)
## 3 Progressive Bulbar Palsy
## 4 Progressive Bulbar Palsy
## 5 Progressive Bulbar Palsy
## 6 Primary Lateral Sclerosis (PLS)
all_counts <- stmn2_novel %>% arrange(desc(counts)) %>% select(sample, tissue, counts_tpm, disease_full) %>% pull(counts_tpm)
# hist(all_counts)
#
# table(all_counts > 0.01)
#
#
# table(stmn2_novel$disease, stmn2_novel$counts_tpm > 0)
# table(stmn2_novel$disease, stmn2_novel$counts_tpm > 0.005)
# table(stmn2_novel$disease, stmn2_novel$counts_tpm > 0.01)
#
# min(stmn2_novel$counts_tpm[ stmn2_novel$counts_tpm > 0])
# max(stmn2_novel$counts_tpm[ stmn2_novel$counts_tpm > 0])
#0.006 JPM = 0.006 reads per 1 million = 1 read per 166 million - 2 samples were sequenced at a depth of >160million reads.
# under a null model, the number of STMN2 junction reads should increase with library size
stmn2_novel %>%
filter( disease == "ALS", pathology != "ALS-SOD1") %>%
filter(tissue %in% tissues_to_plot) %>%
ggplot(aes(x =library_size, counts) ) + geom_point(aes(colour = counts == 0)) +
facet_wrap(~tissue) +
labs(title = "All non-SOD1 ALS samples - effect of library size on novel junction") +
theme_jh()
Some samples were sequenced at double the average sequencing depth. If the assumption is all non-SOD1 ALS samples should have TDP mislocalisation in the spinal cord, does doubling the sequencing depth improve my chance of detecting 1+ STMN2 reads? Yes. In the spinal cord samples, all but 1 of the samples with the high read depth has 1+ STMN2 read. This suggests the lack of completeness across the ALS samples is due to sequencing depth.
5 control samples have a single STMN2 read. 3 of them are Cerebellum. Does this mean let’s accept at least 2 reads as a minimum?
#with(stmn2_novel, table(pathology, true = counts > 1, tissue) )
stmn2_novel %>%
filter(disease == "Control") %>%
arrange(desc(counts)) %>%
head(10)
## junction sample counts individual
## 1 chr8:79611214:79616822:clu_1_- CGND-HRA-00049 1 NEUHC282LVJ
## 2 chr8:79611214:79616822:clu_1_- CGND-HRA-00593 1 NEUDA151YMK
## 3 chr8:79611214:79616822:clu_1_- CGND-HRA-01242 1 NEUEG442LDR
## 4 chr8:79611214:79616822:clu_1_- CGND-HRA-01146 1 PF-UCL-116
## 5 chr8:79611214:79616822:clu_1_- CGND-HRA-01149 1 PF-UCL-117
## 6 chr8:79611214:79616822:clu_1_- CGND-HRA-00591-2 0 NEUHA141EEC
## 7 chr8:79611214:79616822:clu_1_- CGND-HRA-00206 0 NEUHD481VCL
## 8 chr8:79611214:79616822:clu_1_- CGND-HRA-00212 0 NEUHD481VCL
## 9 chr8:79611214:79616822:clu_1_- CGND-HRA-00218 0 NEUHD481VCL
## 10 chr8:79611214:79616822:clu_1_- CGND-HRA-00224 0 NEUHD481VCL
## region tissue tissue_clean site rin pmi
## 1 Spinal_Cord Cervical Spinal Cord Cervical_Spinal_Cord UCSD 7.0 10
## 2 Cerebellum Cerebellum Cerebellum JHU 6.9 14
## 3 Cortex Frontal Cortex Frontal_Cortex JHU 7.5 6
## 4 Cerebellum Cerebellum Cerebellum UCL 4.6 13
## 5 Cerebellum Cerebellum Cerebellum UCL 2.7 13
## 6 Spinal_Cord Cervical Spinal Cord Cervical_Spinal_Cord JHU 5.3 14
## 7 Cerebellum Cerebellum Cerebellum CUMC 8.5 7
## 8 Cortex Frontal Cortex Frontal_Cortex CUMC 7.5 7
## 9 Cortex Medial Motor Cortex Motor_Cortex CUMC 5.7 7
## 10 Cortex Lateral Motor Cortex Motor_Cortex CUMC 6.5 7
## platform prep quote lane flowcell sex
## 1 HiSeq 2500 Manual KAPA RiboErase CGND_12856 L001 ACB20PANXX Female
## 2 HiSeq 2500 Manual KAPA RiboErase CGND_13066 L002 ACBRAPANXX Female
## 3 HiSeq 2500 Manual KAPA RiboErase CGND_13132 L002 ACC56PANXX Male
## 4 NovaSeq Automated KAPA Total CGND_13716 L001 HHGMFDSXX Male
## 5 NovaSeq Automated KAPA Total CGND_13716 L003 HHGMFDSXX Female
## 6 NovaSeq Automated KAPA Total CGND_13867 L001+L002 HJLM5DMXX Male
## 7 HiSeq 2500 Manual KAPA RiboErase CGND_12675 L004 BCAYDHANXX Female
## 8 HiSeq 2500 Manual KAPA RiboErase CGND_12675 L004 BCAYDHANXX Female
## 9 HiSeq 2500 Manual KAPA RiboErase CGND_12675 L004 BCAYDHANXX Female
## 10 HiSeq 2500 Manual KAPA RiboErase CGND_12675 L004 BCAYDHANXX Female
## disease disease_full age onset motor_onset mutations pathology
## 1 Control <NA> 68 NA Not Applicable None Control
## 2 Control <NA> 37 NA Not Applicable None Control
## 3 Control <NA> 22 NA Not Applicable None Control
## 4 Control <NA> 88 NA Not Applicable None Control
## 5 Control <NA> 80 NA Not Applicable None Control
## 6 Control <NA> 72 NA Not Applicable None Control
## 7 Control <NA> 54 NA Not Applicable None Control
## 8 Control <NA> 54 NA Not Applicable None Control
## 9 Control <NA> 54 NA Not Applicable None Control
## 10 Control <NA> 54 NA Not Applicable None Control
## library_size tdp_status disease_tdp junction_type counts_tpm
## 1 113619315 TDP-43 negative Control novel_5SS 0.01760264
## 2 77258265 TDP-43 negative Control novel_5SS 0.02588720
## 3 83524909 TDP-43 negative Control novel_5SS 0.02394495
## 4 88636750 TDP-43 negative Control novel_5SS 0.02256400
## 5 116170973 TDP-43 negative Control novel_5SS 0.01721600
## 6 82331600 TDP-43 negative Control novel_5SS 0.00000000
## 7 70994214 TDP-43 negative Control novel_5SS 0.00000000
## 8 88064237 TDP-43 negative Control novel_5SS 0.00000000
## 9 109710526 TDP-43 negative Control novel_5SS 0.00000000
## 10 89984464 TDP-43 negative Control novel_5SS 0.00000000
1 sample with “Other” has 5 STMN2 novel junction reads in the medial motor cortex and a single read in the lateral motor cortex. That individual had a diagnosis of Primary Lateral Sclerosis but died at 56 - potential ALS misdiagnosis?
Another patient has a single read in Frontal, Temporal and Lumbar Spinal Cord, suggestive of ALS-FTD. However this patient was diagnosed as having Progressive Bulbar Palsy and died at 84.
The final patient with an “Other” diagnosis died at 45 of MSA. However the junction was found in Cerebellum, which intriguingly has multiple samples with a single junction in.
Can this be modelled as a ratio of the two transcripts? For that I would want to assume that the two junctions are negatively correlated - when the amount of novel STMN2 goes up, the amount of canonical STMN2 goes down.
However this relationship only holds if the two isoforms are both widely expressed, which is probably not the case.
# plot correlations between levels
stmn2_wide_tpm <-
stmn2 %>%
select( sample, pathology, disease, counts_tpm, junction_type, tissue) %>%
tidyr::spread(key = junction_type, value = counts_tpm)
stmn2_wide_raw <-
stmn2 %>%
select( sample, pathology, disease, counts, junction_type, tissue) %>%
tidyr::spread(key = junction_type, value = counts)
# stmn2_wide %>%
# ggplot(aes(x = `major isoform`, y = `novel 3SS`) ) + geom_point(aes(colour = pathology) ) +
# theme_jh()
stmn2_wide_raw %>%
filter( novel_5SS > 1 ) %>%
filter(tissue %in% c("Frontal Cortex", "Temporal Cortex", "Cervical Spinal Cord", "Lumbar Spinal Cord", "Lateral Motor Cortex","Medial Motor Cortex") ) %>%
ggplot(aes(x = major, y = novel_5SS) ) + geom_point(aes(colour = disease), size = 1 ) +
theme_jh() +
facet_wrap(~tissue) +
labs(x = "Canonical junction reads", y ="Novel junction reads") +
theme(panel.border = element_rect(fill = NA)) +
ggpubr::stat_cor(method = "spearman") +
labs(title = "Raw counts of the two STMN2 junctions have tissue specific correlation structure")
stmn2_wide_tpm %>%
filter( novel_5SS > 0.01 ) %>%
filter(tissue %in% c("Frontal Cortex", "Temporal Cortex", "Cervical Spinal Cord", "Lumbar Spinal Cord", "Lateral Motor Cortex","Medial Motor Cortex") ) %>%
ggplot(aes(x = major, y = novel_5SS) ) + geom_point(aes(colour = disease), size = 1 ) +
theme_jh() +
facet_wrap(~tissue) +
labs(x = "Canonical junctions per million", y ="Novel junctions per million") +
theme(panel.border = element_rect(fill = NA)) +
ggpubr::stat_cor(method = "spearman") +
labs(title = "Normalised counts of the two STMN2 junctions have tissue specific correlation structure")
# does regressing out variation in major isoform remove novel isoform? probably not.
#
# stmn2_wide %>%
# ggplot(aes(x = `major isoform`, y = `novel 5SS`) ) + geom_point(aes(colour = as.factor(fcx_nmf_cluster) ) ) +
# theme_jh()
#
# stmn2_wide %>%
# ggplot(aes(y = `major isoform`, x = pathology) ) + geom_jitter(aes(colour = as.factor(fcx_nmf_cluster)), width = 0.25 ) +
# theme_jh()
#
# stmn2_wide %>%
# ggplot(aes(x = PC1, y = `major isoform`)) + geom_point(aes(colour = fcx_nmf_cluster))
#
# stmn2_wide %>%
# ggplot(aes(x = PC1, y = `novel 5SS`)) + geom_point(aes(colour = fcx_nmf_cluster))
Opposite correlations appear, with negative correaltions between STMN2 canonical isoform and the novel isoform in FTD samples in Frontal and Temporal Cortexes. However, There is a positive correlation between the two in ALS samples in the spinal cord. Both these phenomena persist when normalising by library size. In FTD this suggests that STMN2 novel splicing is associated with a reduction in canonical STMN2 expression whereas in ALS the two transcripts potentially act independently, both increasing with the proportion of neurons captured in a sample.
Estimate neuronal proportion in each sample and compare to STMN novel and canonical expression.
Consider collapsing Motor Cortex and Spinal Cord regions down?
# stmn2_novel %>%
# filter(region == "Spinal_Cord") %>%
# ggplot(aes(x = motor_onset, y = count))
stmn2_motor_onset <-
stmn2 %>%
filter( junction_type %in% c("novel_5SS")) %>%
filter(disease == "ALS", motor_onset %in% c("Limb", "Bulbar"), !mutations %in% c("SOD1", "ANG")) %>%
filter( tissue %in% c("Cervical Spinal Cord", "Lumbar Spinal Cord", "Lateral Motor Cortex", "Medial Motor Cortex") ) %>%
mutate( stmn2_1 = counts == 1, stmn2_2 = counts >=2 & counts < 5, stmn2_5 = counts >= 5, stmn2_all = counts > 0) %>%
group_by(tissue, motor_onset ) %>%
summarise( samples = n(), all = sum(stmn2_all) / n(), `1` = sum(stmn2_1) / n(), `2-5` = sum(stmn2_2) / n(), `5+` = sum(stmn2_5) / n() ) %>%
tidyr::gather(key = "threshold", value = prop, -tissue, -motor_onset, -samples) %>%
ungroup()
# per tissue proportion test
stmn2_motor_onset %>%
filter(threshold == "all") %>%
mutate( x = prop * samples) %>%
split(.$tissue) %>%
purrr::map_df( ~{ tibble(p.value = prop.test(x = .x$x, n = .x$samples)$p.value ) }, .id = "tissue")
## Warning in prop.test(x = .x$x, n = .x$samples): Chi-squared approximation
## may be incorrect
## Warning in prop.test(x = .x$x, n = .x$samples): Chi-squared approximation
## may be incorrect
## # A tibble: 4 x 2
## tissue p.value
## <chr> <dbl>
## 1 Cervical Spinal Cord 0.821
## 2 Lateral Motor Cortex 0.585
## 3 Lumbar Spinal Cord 0.440
## 4 Medial Motor Cortex 1.000
# all tissues combined - proportion test
stmn2_motor_onset %>%
filter(threshold == "all") %>%
mutate( x = prop * samples) %>%
group_by( motor_onset ) %>%
summarise( n = sum(samples), x = sum(x)) %>%
with( prop.test(x =x , n = n) )
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: x out of n
## X-squared = 1.5573, df = 1, p-value = 0.2121
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.0482568 0.2470944
## sample estimates:
## prop 1 prop 2
## 0.6323529 0.5329341
stmn2_motor_onset %>%
filter(threshold != "all") %>%
ggplot(aes(x = motor_onset, y = prop)) +
geom_col(aes(alpha = threshold), colour = "white", fill = "blue") +
facet_wrap(~tissue) +
theme_jh() +
scale_alpha_manual("read\nthreshold",values = c(0.3,0.6,1) ) +
theme( panel.border = element_rect(fill = NA), axis.line = element_blank() ) +
labs(title = "STMN2 novel junctions in sporadic ALS patients", x = "Site of motor onset", y = "Percentage of samples") +
scale_y_continuous(expand = c(0,0), limits = c(0,1.1),labels = scales::percent )